{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "Python 3.6.7 64-bit ('DataAnalysis': conda)",
   "display_name": "Python 3.6.7 64-bit ('DataAnalysis': conda)",
   "metadata": {
    "interpreter": {
     "hash": "07829f70181eff8a51ba0c91cec5a602c5da58bbdadabc96e22cfe4f56b0b078"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "*** Introductory Examples for the NLTK Book ***\n",
      "Loading text1, ..., text9 and sent1, ..., sent9\n",
      "Type the name of the text or sentence to view it.\n",
      "Type: 'texts()' or 'sents()' to list the materials.\n",
      "text1: Moby Dick by Herman Melville 1851\n",
      "text2: Sense and Sensibility by Jane Austen 1811\n",
      "text3: The Book of Genesis\n",
      "text4: Inaugural Address Corpus\n",
      "text5: Chat Corpus\n",
      "text6: Monty Python and the Holy Grail\n",
      "text7: Wall Street Journal\n",
      "text8: Personals Corpus\n",
      "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
     ]
    }
   ],
   "source": [
    "from tools import *\n",
    "from func import *\n",
    "from nltk.book import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap1 语言处理与Python\n",
    "\n",
    "目的：\n",
    "\n",
    "1.  简单的程序如何与大规模的文本结合？\n",
    "2.  如何自动地提取出关键字和词组？如何使用它们来总结文本的风格和内容？\n",
    "3.  Python为文本处理提供了哪些工具和技术？\n",
    "4.  自然语言处理中还有哪些有趣的挑战？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 1.2 将文本当作词链表\n",
    "### 1.2.1 链表"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent1=  ['Call', 'me', 'Ishmael', '.']\nsent1 的单词个数=  4\nsent1 中每个单词被使用的次数的平均数=  1.0\n"
     ]
    }
   ],
   "source": [
    "# Build a four-token sentence by hand, then report its length and the value\n",
    "# returned by lexical_diversity() (supplied by one of the star imports in the first cell).\n",
    "sent1 = ['Call', 'me', 'Ishmael', '.']\n",
    "print(\"sent1= \", sent1)\n",
    "token_count = len(sent1)\n",
    "print(\"sent1 的单词个数= \", token_count)\n",
    "diversity = lexical_diversity(sent1)\n",
    "print(\"sent1 中每个单词被使用的次数的平均数= \", diversity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent2=  ['The', 'family', 'of', 'Dashwood', 'had', 'long', 'been', 'settled', 'in', 'Sussex', '.']\nsent1+sent2=  ['Call', 'me', 'Ishmael', '.', 'The', 'family', 'of', 'Dashwood', 'had', 'long', 'been', 'settled', 'in', 'Sussex', '.']\n"
     ]
    }
   ],
   "source": [
    "# `+` on two lists builds a new list holding the elements of both, in order.\n",
    "print(\"sent2= \", sent2)\n",
    "combined = sent1 + sent2\n",
    "print(\"sent1+sent2= \", combined)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent1=  ['Call', 'me', 'Ishmael', '.']\n--------------- >sent1.append('Some')< ---------------\n['Call', 'me', 'Ishmael', '.', 'Some']\n"
     ]
    }
   ],
   "source": [
    "# list.append() changes the list in place and returns None, so it is called\n",
    "# as a statement and the list itself is printed afterwards.\n",
    "sent1 = ['Call', 'me', 'Ishmael', '.']  # rebind so the cell starts from four tokens on every run\n",
    "print(\"sent1= \", sent1)\n",
    "show_subtitle(\"sent1.append('Some')\")\n",
    "sent1.append('Some')\n",
    "print(sent1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent1=  ['Call', 'me', 'Ishmael', '.']\n--------------- >sent1.append(sent2)< ---------------\n['Call', 'me', 'Ishmael', '.', ['The', 'family', 'of', 'Dashwood', 'had', 'long', 'been', 'settled', 'in', 'Sussex', '.']]\n"
     ]
    }
   ],
   "source": [
    "# Appending a list adds it as ONE nested element; it does not splice the\n",
    "# elements in the way `sent1 + sent2` does.\n",
    "sent1 = ['Call', 'me', 'Ishmael', '.']  # rebind so the cell starts from four tokens on every run\n",
    "print(\"sent1= \", sent1)\n",
    "# Banner first, then the operation it names, matching the append('Some') cell.\n",
    "show_subtitle(\"sent1.append(sent2)\")\n",
    "sent1.append(sent2)\n",
    "print(sent1)"
   ]
  },
  {
   "source": [
    "### 1.2.2 索引列表"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "text4[173]=  awaken\ntext4.index('awaken')=  173\n"
     ]
    }
   ],
   "source": [
    "# Look a token up by its position, then look a position up by its token.\n",
    "token_at_173 = text4[173]\n",
    "print(\"text4[173]= \", token_at_173)\n",
    "awaken_pos = text4.index('awaken')\n",
    "print(\"text4.index('awaken')= \", awaken_pos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >text5[16715:16735]< ---------------\n['U86', 'thats', 'why', 'something', 'like', 'gamefly', 'is', 'so', 'good', 'because', 'you', 'can', 'actually', 'play', 'a', 'full', 'game', 'without', 'buying', 'it']\n--------------- >text6[1600:1625]< ---------------\n['We', \"'\", 're', 'an', 'anarcho', '-', 'syndicalist', 'commune', '.', 'We', 'take', 'it', 'in', 'turns', 'to', 'act', 'as', 'a', 'sort', 'of', 'executive', 'officer', 'for', 'the', 'week']\n"
     ]
    }
   ],
   "source": [
    "# Print two slices of the loaded texts, each under a banner naming the expression.\n",
    "for caption, tokens in (\n",
    "    (\"text5[16715:16735]\", text5[16715:16735]),\n",
    "    (\"text6[1600:1625]\", text6[1600:1625]),\n",
    "):\n",
    "    show_subtitle(caption)\n",
    "    print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent[0]=  word1\nsent[9]=  word10\n"
     ]
    }
   ],
   "source": [
    "# Indices start at zero: a ten-element list has valid positions 0 through 9.\n",
    "sent = [\n",
    "    'word1', 'word2', 'word3', 'word4', 'word5',\n",
    "    'word6', 'word7', 'word8', 'word9', 'word10',\n",
    "]\n",
    "first_token, last_token = sent[0], sent[9]\n",
    "print(\"sent[0]= \", first_token)\n",
    "print(\"sent[9]= \", last_token)\n",
    "# Position 10 is past the end and would raise IndexError:\n",
    "# print(\"sent[10]= \", sent[10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent[5:8]=  ['word6', 'word7', 'word8']\nsent[5]=  word6\nsent[7]=  word8\n"
     ]
    }
   ],
   "source": [
    "# The slice [5:8] covers indices 5, 6 and 7; the end index 8 is excluded.\n",
    "window = sent[5:8]\n",
    "first_in_window, last_in_window = sent[5], sent[7]\n",
    "print(\"sent[5:8]= \", window)\n",
    "print(\"sent[5]= \", first_in_window)\n",
    "print(\"sent[7]= \", last_in_window)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent[:3]=  ['word1', 'word2', 'word3']\nsent[-1:]=  ['word10']\nsent[:-1]=  ['word1', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9']\n"
     ]
    }
   ],
   "source": [
    "# A missing slice bound means 'from the start' or 'to the end';\n",
    "# a negative bound counts back from the end of the list.\n",
    "head, tail, all_but_last = sent[:3], sent[-1:], sent[:-1]\n",
    "print(\"sent[:3]= \", head)\n",
    "print(\"sent[-1:]= \", tail)\n",
    "print(\"sent[:-1]= \", all_but_last)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "text2[141525:]=  ['among', 'the', 'merits', 'and', 'the', 'happiness', 'of', 'Elinor', 'and', 'Marianne', ',', 'let', 'it', 'not', 'be', 'ranked', 'as', 'the', 'least', 'considerable', ',', 'that', 'though', 'sisters', ',', 'and', 'living', 'almost', 'within', 'sight', 'of', 'each', 'other', ',', 'they', 'could', 'live', 'without', 'disagreement', 'between', 'themselves', ',', 'or', 'producing', 'coolness', 'between', 'their', 'husbands', '.', 'THE', 'END']\n"
     ]
    }
   ],
   "source": [
    "# An open-ended slice runs from the given index to the end of the text.\n",
    "closing_tokens = text2[141525:]\n",
    "print(\"text2[141525:]= \", closing_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "len(sent):  10\nsent[1:9]=  ['word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9']\n"
     ]
    }
   ],
   "source": [
    "# Assigning to an index replaces that element; the list keeps its length.\n",
    "sent = [\n",
    "    'word1', 'word2', 'word3', 'word4', 'word5',\n",
    "    'word6', 'word7', 'word8', 'word9', 'word10',\n",
    "]\n",
    "sent[0], sent[9] = 'First', 'Last'\n",
    "print(\"len(sent): \", len(sent))\n",
    "print(\"sent[1:9]= \", sent[1:9])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent=  ['First', 'Second', 'Third', 'Last']\nlen(sent)=  4\n"
     ]
    }
   ],
   "source": [
    "# Slice assignment can change a list's length: the eight elements at indices\n",
    "# 1-8 are replaced by two. The starting list is rebuilt here so that re-running\n",
    "# this cell always gives the same result instead of shrinking `sent` further.\n",
    "sent = ['First', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9', 'Last']\n",
    "sent[1:9] = ['Second', 'Third']\n",
    "print(\"sent= \", sent)\n",
    "print(\"len(sent)= \", len(sent))\n",
    "# Index 9 no longer exists after the replacement and would raise IndexError:\n",
    "# print(\"sent[9]= \", sent[9])"
   ]
  },
  {
   "source": [
    "### 1.2.3 变量"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "noun_phrase=  ['bold', 'Sir', 'Robin']\n"
     ]
    }
   ],
   "source": [
    "# Assignment binds a name to a value; a slice of one list can be bound to a new name.\n",
    "# sent1 had a list appended to it in an earlier cell, so rebind it to the plain four tokens.\n",
    "sent1 = ['Call', 'me', 'Ishmael', '.']\n",
    "my_sent = [\n",
    "    'Bravely', 'bold', 'Sir', 'Robin', ',', 'rode',\n",
    "]\n",
    "noun_phrase = my_sent[1:4]\n",
    "print(\"noun_phrase= \", noun_phrase)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >sorted(noun_phrase)< ---------------\n['Robin', 'Sir', 'bold']\n"
     ]
    }
   ],
   "source": [
    "# Sorting strings places words starting with an uppercase letter before lowercase ones.\n",
    "# NOTE(review): the unusual identifier `w0rDs` looks deliberate (a name may mix\n",
    "# case and digits) - confirm before renaming it.\n",
    "w0rDs = list(noun_phrase)\n",
    "w0rDs.sort()\n",
    "show_subtitle(\"sorted(noun_phrase)\")\n",
    "print(w0rDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 变量名不能使用 Python 的保留字\n",
    "# not='Camelot'  # SyntaxError: invalid syntax"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >len(set(text1))< ---------------\nvocab_size=  19317\n"
     ]
    }
   ],
   "source": [
    "# Converting the token sequence to a set keeps one copy of each distinct token;\n",
    "# the size of that set is the vocabulary size.\n",
    "show_subtitle(\"len(set(text1))\")\n",
    "vocab = set(text1)\n",
    "vocab_size = len(vocab)\n",
    "print(\"vocab_size= \", vocab_size)"
   ]
  },
  {
   "source": [
    "### 1.2.4 字符串\n",
    "\n",
    "字符串的索引和切片方式与链表相同；但字符串是不可变的，不能通过索引赋值来修改其中的字符。"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "name=  Monty\nname[0]=  M\nname[:4]=  Mont\nname*2=  MontyMonty\nname+'!'=  Monty!\n"
     ]
    }
   ],
   "source": [
    "# Strings are indexed, sliced, repeated (*) and concatenated (+) the same way lists are.\n",
    "name = 'Monty'\n",
    "# Unlike lists, strings are immutable, so assigning to an index fails:\n",
    "# name[0] = 'm'  # TypeError: 'str' object does not support item assignment\n",
    "\n",
    "print(\"name= \", name)\n",
    "print(\"name[0]= \", name[0])\n",
    "print(\"name[:4]= \", name[:4])\n",
    "print(\"name*2= \", name * 2)\n",
    "print(\"name+'!'= \", name + '!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "' '.join(['Monty', 'Python'])=  Monty Python\n'Monty Python'.split()=  ['Monty', 'Python']\n"
     ]
    }
   ],
   "source": [
    "# str.join() glues a list of strings together with the separator it is called on;\n",
    "# str.split() with no argument breaks a string apart on whitespace.\n",
    "joined = ' '.join(['Monty', 'Python'])\n",
    "print(\"' '.join(['Monty', 'Python'])= \", joined)\n",
    "pieces = 'Monty Python'.split()\n",
    "print(\"'Monty Python'.split()= \", pieces)"
   ]
  }
 ]
}